import numpy as np


def step_function(x):
    """
    Step function: returns 1 where x > 0, else 0.
    """
    return np.array(x > 0, dtype=np.int32)
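
# Example: step_function is elementwise, so it accepts arrays of any shape.
# >>> step_function(np.array([-1.0, 0.0, 2.0]))
# array([0, 0, 1], dtype=int32)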


def sigmoid(x):
    """
    Sigmoid function: 1 / (1 + exp(-x)).
    """
    return 1 / (1 + np.exp(-x))


def sigmoid_grad(x):
    """
    sigmoid函数梯度
    """
    return (1.0 - sigmoid(x)) * sigmoid(x)
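
# Example: sigmoid(0) = 0.5, so the analytic gradient at 0 is 0.5 * 0.5 = 0.25;
# this agrees with numerical_diff(sigmoid, 0.0) below to roughly 1e-9.
# >>> sigmoid(0.0)
# 0.5
# >>> sigmoid_grad(0.0)
# 0.25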


def relu(x):
    """
    ReLU function: elementwise max(0, x).
    """
    return np.maximum(0, x)


def relu_grad(x):
    """
    ReLU函数梯度
    """
    grad = np.zeros(x)
    grad[x >= 0] = 1
    return grad
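
# Example: with the convention above, the gradient at x == 0 is 1.
# >>> relu(np.array([-2.0, 0.0, 3.0]))
# array([0., 0., 3.])
# >>> relu_grad(np.array([-2.0, 0.0, 3.0]))
# array([0., 1., 1.])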


def softmax(x):
    """
    softmax函数
    """
    if x.ndim == 2:
        x = x.T
        x = x - np.max(x, axis=0)
        y = np.exp(x) / np.sum(np.exp(x), axis=0)
        return y.T
    x = x - np.max(x)  # 溢出对策
    return np.exp(x) / np.sum(np.exp(x))
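
# Example: subtracting the per-sample max makes softmax safe for large logits,
# and each output row sums to 1 (up to float rounding).
# >>> softmax(np.array([1000.0, 1000.0]))
# array([0.5, 0.5])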


def mean_squared_error(y, t):
    """
    Sum-of-squares error, 0.5 * sum((y - t)**2), where y is the prediction
    and t is the target (not divided by the batch size despite the name).
    """
    return 0.5 * np.sum((y - t) ** 2)
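
# Example: 0.5 * ((0.1 - 0.0)**2 + (0.9 - 1.0)**2) is roughly 0.01.
# >>> mean_squared_error(np.array([0.1, 0.9]), np.array([0.0, 1.0]))  # ~0.01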


def cross_entropy_error(y, t):
    """
    Cross-entropy error, where y holds predicted probabilities and t holds the
    labels, either as class indices or as one-hot vectors.
    """
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    # If the labels are one-hot vectors, convert them to class indices
    if t.size == y.size:
        t = t.argmax(axis=1)
    batch_size = y.shape[0]
    # 1e-7 avoids log(0) when a predicted probability is exactly zero
    return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size
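
# Example: labels given as class indices or as one-hot rows yield the same loss,
# here -(log(0.8) + log(0.6)) / 2, roughly 0.367.
# >>> y = np.array([[0.1, 0.8, 0.1], [0.3, 0.6, 0.1]])
# >>> cross_entropy_error(y, np.array([1, 1]))   # ~0.367
# >>> cross_entropy_error(y, np.eye(3)[[1, 1]])  # same value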


def softmax_loss(x, t):
    """
    Softmax followed by cross-entropy loss.
    """
    y = softmax(x)
    return cross_entropy_error(y, t)
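
# Example: uniform logits over 3 classes give a loss of log(3), roughly 1.0986,
# regardless of which class is correct.
# >>> softmax_loss(np.array([0.0, 0.0, 0.0]), np.array([0, 0, 1]))  # ~1.0986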


def numerical_diff(f, x):
    """
    数值微分
    """
    h = 1e-4  # 0.0001
    return (f(x + h) - f(x - h)) / (2 * h)
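
# Example: the central difference of f(x) = x**2 at x = 3 is ~6.0, matching
# the analytic derivative 2x.
# >>> numerical_diff(lambda x: x**2, 3.0)  # ~6.0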


def _numerical_gradient_no_batch(f, x):
    """
    Numerical gradient of f at a 1-D point x (entries are perturbed in place and restored).
    """
    h = 1e-4  # 0.0001
    grad = np.zeros_like(x)  # gradient array with the same shape as x
    for idx in range(x.size):
        tmp_val = x[idx]
        # f(x + h)
        x[idx] = tmp_val + h
        fxh1 = f(x)

        # f(x - h)
        x[idx] = tmp_val - h
        fxh2 = f(x)

        grad[idx] = (fxh1 - fxh2) / (2 * h)
        x[idx] = tmp_val  # restore the original value
    return grad


def numerical_gradient(f, x):
    """
    Numerical gradient of f at x; iterates over the rows when x is batched.
    """
    if x.ndim == 1:
        return _numerical_gradient_no_batch(f, x)
    else:
        grad = np.zeros_like(x)
        for idx, row in enumerate(x):
            grad[idx] = _numerical_gradient_no_batch(f, row)
        return grad
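
# Example: for f(x) = x0**2 + x1**2 the gradient is (2*x0, 2*x1). Note that x
# must be a float array, because its entries are perturbed in place by h = 1e-4.
# >>> numerical_gradient(lambda x: np.sum(x ** 2), np.array([3.0, 4.0]))
# array([6., 8.])  # approximately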


def gradient_descent(f, init_x, lr=0.01, step_num=100):
    """
    梯度下降 
    f 是要进行最优化的函数
    init_x 是初始值
    lr 是学习率
    step_num 是重复次数
    """
    x = init_x
    x_history = []
    for _ in range(step_num):
        x_history.append(x.copy())
        grad = numerical_gradient(f, x)
        x -= lr * grad
    return x, np.array(x_history)
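

# A minimal end-to-end check (a sketch; run this file directly to execute it):
# gradient descent on f(x) = x0**2 + x1**2 should converge toward the origin.
if __name__ == "__main__":
    def f(x):
        return np.sum(x ** 2)

    x_final, history = gradient_descent(f, np.array([-3.0, 4.0]), lr=0.1, step_num=100)
    print(x_final)        # close to [0., 0.] after 100 steps
    print(history.shape)  # (100, 2)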
